#geographic investigation
#18 September 2020
library(vcfR)
## Warning: package 'vcfR' was built under R version 3.5.2
##
## ***** *** vcfR *** *****
## This is vcfR 1.9.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
library(ggplot2)
library(gridExtra)
library(ggridges)
## Warning: package 'ggridges' was built under R version 3.5.2
library(adegenet)
## Loading required package: ade4
## Warning: package 'ade4' was built under R version 3.5.2
##
## /// adegenet 2.1.3 is loaded ////////////
##
## > overview: '?adegenet'
## > tutorials/doc/questions: 'adegenetWeb()'
## > bug reports/feature requests: adegenetIssues()
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(StAMPP)
## Warning: package 'StAMPP' was built under R version 3.5.2
## Loading required package: pegas
## Warning: package 'pegas' was built under R version 3.5.2
## Loading required package: ape
## Warning: package 'ape' was built under R version 3.5.2
##
## Attaching package: 'pegas'
## The following object is masked from 'package:ape':
##
## mst
## The following object is masked from 'package:ade4':
##
## amova
## The following object is masked from 'package:vcfR':
##
## getINFO
library(gplots)
## Warning: package 'gplots' was built under R version 3.5.2
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(scatterpie)
library(geosphere)
## Warning: package 'geosphere' was built under R version 3.5.2
library(ggtree)
## Warning: package 'ggtree' was built under R version 3.5.2
## ggtree v1.14.6 For help: https://guangchuangyu.github.io/software/ggtree
##
## If you use ggtree in published research, please cite the most appropriate paper(s):
##
## - Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution 2017, 8(1):28-36, doi:10.1111/2041-210X.12628
##
## - Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods for mapping and visualizing associated data on phylogeny using ggtree. Molecular Biology and Evolution 2018, accepted. doi: 10.1093/molbev/msy194
##
## Attaching package: 'ggtree'
## The following object is masked from 'package:ape':
##
## rotate
#read in locality info for samples
locs<-read.csv("~/Desktop/aph.data/rad.sampling.csv")
#fix mislabeled sample
locs$id<-as.character(locs$id)
locs$id[locs$id == "A_californica_334171"]<-"A_woodhouseii_334171"
#read in vcf
vcfR <- read.vcfR("~/Desktop/aph.data/filtered.vcf.gz")
## Scanning file to determine attributes.
## File attributes:
## meta lines: 14
## header_line: 15
## variant count: 11273
## column count: 104
##
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 11273
## Character matrix gt cols: 104
## skip: 0
## nrows: 11273
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant: 11273
## All variants processed
#fix mislabeled sample
colnames(vcfR@gt)[colnames(vcfR@gt) == "A_californica_334171"]<-"A_woodhouseii_334171"
#convert to genlight
gen<- vcfR2genlight(vcfR)
#reorder gen
gen<-gen[c(1:19,21:56,20,57:95),]
#subset locs to include only samples that passed filtering, and have been retained in the vcf
locs<-locs[locs$id %in% gen@ind.names,]
locs<-locs[c(1:19,21:56,20,57:95),]
locs[56,5]<-"woodhouseii"
table(locs$species)
##
## californica coerulescens insularis sumichrasti woodhouseii
## 31 6 8 10 40
table(locs$subspecies)
##
## californica caurina coerulescens cyanotis grisea hypoleuca
## 5 4 6 9 6 4
## immanis insularis nevadae obscura oocleptica remota
## 5 8 10 8 5 5
## sumichrasti texana woodhouseii
## 5 7 8
#split df by species
#spec.dfs<-split(locs, locs$species)
#init sampling.df which will be a df of samples grouped by unique lat/long
#sampling.df<-data.frame(NULL)
#for (i in names(spec.dfs)){
# samps<-spec.dfs[[i]] %>% group_by(decimallatitude, decimallongitude) %>% summarize(count=n())
# df<-cbind(rep(i, times=nrow(samps)), samps)
# sampling.df<-as.data.frame(rbind(sampling.df, df))
#}
#fix colnames
#colnames(sampling.df)<-c("species","lat","long","count")
#make full map
pac<-map_data("world")
#
#ggplot()+
# geom_polygon(data = pac, aes(x=long, y = lat, group = group), fill="grey", col="black", cex=.1)+
# coord_sf(xlim = c(-123, -80), ylim = c(14, 47)) +
# geom_point(data = sampling.df, aes(x = long, y = lat, col=species, size=count), alpha =.9, show.legend=TRUE) +
# theme_classic()+
# scale_color_manual(values=funky(5))+
# scale_size_continuous(range = c(2,8))+
# guides(colour = guide_legend(override.aes = list(size = 4), order=1, label.theme = element_text(face = "italic")),
# size = guide_legend(nrow = 1, order = 2))+
# theme(legend.position = c(0.01, 0.01), legend.justification = c(0.01, 0.01))
#plot map colored by species
ggplot()+
geom_polygon(data = pac, aes(x=long, y = lat, group = group), fill="grey", col="black", cex=.1)+
coord_sf(xlim = c(-123, -80), ylim = c(14, 47)) +
geom_point(data = locs, aes(x = decimallongitude, y = decimallatitude, col=species), alpha =.9, size=3, show.legend=TRUE,
position = position_jitter(width = 0.3, height = 0.3)) +
theme_classic()+
guides(colour = guide_legend(override.aes = list(size = 4), order=1, label.theme = element_text(face = "italic")),
size = guide_legend(nrow = 1, order = 2))+
theme(legend.position = c(0.01, 0.01), legend.justification = c(0.01, 0.01),legend.key=element_blank(),legend.background=element_blank())

map of distributions


knitr::include_graphics(c("/Users/devder/Desktop/bic.png"))

#Do dapc and choose groups
#K=5
#assign samples to the number of groups present in popmap, retain all PCAs
#grp<-find.clusters(gen)
#set manually the values I chose based on looking at the scree plots
grp<-find.clusters(gen, n.pca=94, n.clust=5)
#run dapc, retain all discriminant axes, and enough PC axes to explain 75% of variance
#dapc1<-dapc(gen, grp$grp)
#set manually the values I chose from scree plots
dapc1<-dapc(gen, grp$grp, n.da = 5, n.pca =6)
#plot compoplot
compoplot(dapc1, legend=FALSE, show.lab =TRUE, cex.names=.4)

#
post<-as.data.frame(dapc1$posterior)
post$id<-rownames(post)
plot.pies<-merge(post, locs, by= "id")
#
ggplot()+
geom_polygon(data = pac, aes(x=long, y = lat, group = group), fill="grey", col="black", cex=.1)+
coord_sf(xlim = c(-123, -80), ylim = c(14, 47)) +
geom_scatterpie(aes(x=decimallongitude, y=decimallatitude, group = decimallatitude),
data = plot.pies, cols = colnames(plot.pies[,c(2:6)]))+
theme_classic()

#K=6
#set manually the values I chose based on looking at the scree plots
grp<-find.clusters(gen, n.pca=94, n.clust=6)
#set manually the values I chose from scree plots
dapc1<-dapc(gen, grp$grp, n.da = 6, n.pca =7)
#plot compoplot
compoplot(dapc1, legend=FALSE, show.lab =TRUE, cex.names=.4)

#
post<-as.data.frame(dapc1$posterior)
post$id<-rownames(post)
plot.pies<-merge(post, locs, by= "id")
#
ggplot()+
geom_polygon(data = pac, aes(x=long, y = lat, group = group), fill="grey", col="black", cex=.1)+
coord_sf(xlim = c(-123, -80), ylim = c(14, 47)) +
geom_scatterpie(aes(x=decimallongitude, y=decimallatitude, group = decimallatitude),
data = plot.pies, cols = colnames(plot.pies[,c(2:7)]))+
theme_classic()

#K=7
#set manually the values I chose based on looking at the scree plots
grp<-find.clusters(gen, n.pca=94, n.clust=7)
#set manually the values I chose from scree plots
dapc1<-dapc(gen, grp$grp, n.da = 7, n.pca =8)
#plot compoplot
compoplot(dapc1, legend=FALSE, show.lab =TRUE, cex.names=.4)

#
post<-as.data.frame(dapc1$posterior)
post$id<-rownames(post)
plot.pies<-merge(post, locs, by= "id")
#
ggplot()+
geom_polygon(data = pac, aes(x=long, y = lat, group = group), fill="grey", col="black", cex=.1)+
coord_sf(xlim = c(-123, -80), ylim = c(14, 47)) +
geom_scatterpie(aes(x=decimallongitude, y=decimallatitude, group = decimallatitude),
data = plot.pies, cols = colnames(plot.pies[,c(2:8)]))+
theme_classic()

#K=8
#set manually the values I chose based on looking at the scree plots
grp<-find.clusters(gen, n.pca=94, n.clust=8)
#set manually the values I chose from scree plots
dapc1<-dapc(gen, grp$grp, n.da = 8, n.pca =9)
#plot compoplot
compoplot(dapc1, legend=FALSE, show.lab =TRUE, cex.names=.4)

#
post<-as.data.frame(dapc1$posterior)
post$id<-rownames(post)
plot.pies<-merge(post, locs, by= "id")
#
ggplot()+
geom_polygon(data = pac, aes(x=long, y = lat, group = group), fill="grey", col="black", cex=.1)+
coord_sf(xlim = c(-123, -80), ylim = c(14, 47)) +
geom_scatterpie(aes(x=decimallongitude, y=decimallatitude, group = decimallatitude),
data = plot.pies, cols = colnames(plot.pies[,c(2:9)]))+
theme_classic()

#K=9
#set manually the values I chose based on looking at the scree plots
grp<-find.clusters(gen, n.pca=94, n.clust=9)
#set manually the values I chose from scree plots
dapc1<-dapc(gen, grp$grp, n.da = 9, n.pca =10)
#plot compoplot
compoplot(dapc1, legend=FALSE, show.lab =TRUE, cex.names=.4)

#
post<-as.data.frame(dapc1$posterior)
post$id<-rownames(post)
plot.pies<-merge(post, locs, by= "id")
#
ggplot()+
geom_polygon(data = pac, aes(x=long, y = lat, group = group), fill="grey", col="black", cex=.1)+
coord_sf(xlim = c(-123, -80), ylim = c(14, 47)) +
geom_scatterpie(aes(x=decimallongitude, y=decimallatitude, group = decimallatitude),
data = plot.pies, cols = colnames(plot.pies[,c(2:10)]))+
theme_classic()

#calculate divergence btwn samples
gen@pop<-grp$grp
inds<-stamppNeisD(gen, pop = FALSE)
#plot tree colored by subspecies
tree <- nj(inds)
#make manual color vector
col.vec<-as.character(locs$subspecies)
gg_color_hue <- function(n) {
hues = seq(15, 375, length = n + 1)
hcl(h = hues, l = 65, c = 100)[1:n]
}
n = 15
cols = gg_color_hue(n)
j=1
#convert subspecies to colors
for (i in levels(as.factor(col.vec))){
col.vec[col.vec == i]<-cols[j]
j<-j+1
}
col.vec<-col.vec[c(1:19,57,20:56,58:95)]
#plot map colored by subspecies
ggplot()+
geom_polygon(data = pac, aes(x=long, y = lat, group = group), fill="grey", col="black", cex=.1)+
coord_sf(xlim = c(-123, -80), ylim = c(14, 47)) +
geom_point(data = locs, aes(x = decimallongitude, y = decimallatitude, col=subspecies), alpha =.9, size=3, show.legend=TRUE,
position = position_jitter(width = 0.3, height = 0.3)) +
theme_classic()+
guides(colour = guide_legend(override.aes = list(size = 4), order=1, label.theme = element_text(face = "italic")),
size = guide_legend(nrow = 1, order = 2))+
theme(legend.position = c(0.01, 0.01), legend.justification = c(0.01, 0.01),legend.key=element_blank(),legend.background=element_blank())

#plot ggtree
ggtree(tree)+
geom_tippoint(color=col.vec)+
geom_tiplab(cex=2)
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: `mutate_()` is deprecated as of dplyr 0.7.0.
## Please use `mutate()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

#subset genlight to only western scrub jays
wood.gen<-new("genlight",(as.matrix(gen)[locs$species == "californica" | locs$species == "woodhouseii" | locs$species == "sumichrasti", ]))
#subset loc info
wood.locs<-locs[locs$species == "californica" | locs$species == "woodhouseii" | locs$species == "sumichrasti", ]
wood.locs<-droplevels(wood.locs)
#plot map of woodfornia colored by subspecies
ggplot()+
geom_polygon(data = pac, aes(x=long, y = lat, group = group), fill="grey", col="black", cex=.1)+
coord_sf(xlim = c(-120, -95), ylim = c(15, 45)) +
geom_point(data = wood.locs, aes(x = decimallongitude, y = decimallatitude, col=subspecies), alpha =.9, size=3, show.legend=TRUE,
position = position_jitter(width = 0.3, height = 0.3)) +
theme_classic()+
guides(colour = guide_legend(override.aes = list(size = 4), order=1, label.theme = element_text(face = "italic")),
size = guide_legend(nrow = 1, order = 2))+
theme(legend.position = c(0.01, 0.01), legend.justification = c(0.01, 0.01),legend.key=element_blank(),legend.background=element_blank())

#PCA of wood
wood.pca<-glPca(wood.gen, nf=6)
#pull pca scores out of df
wood.pca.scores<-as.data.frame(wood.pca$scores)
wood.pca.scores$subspecies<-wood.locs$subspecies
#ggplot color by subspecies
ggplot(wood.pca.scores, aes(x=PC1, y=PC2, color=subspecies)) +
geom_point(cex = 2)

#ggplot PC3 and PC4
ggplot(wood.pca.scores, aes(x=PC3, y=PC4, color=subspecies)) +
geom_point(cex = 2)

#subset genlight to only california scrub jays
cali.gen<-new("genlight",(as.matrix(gen)[locs$species == "californica", ]))
#subset loc info
cali.locs<-locs[locs$species == "californica", ]
#PCA of cali
cali.pca<-glPca(cali.gen, nf=6)
#pull pca scores out of df
cali.pca.scores<-as.data.frame(cali.pca$scores)
#plot map of california colored by subspecies
ggplot()+
geom_polygon(data = pac, aes(x=long, y = lat, group = group), fill="grey", col="black", cex=.1)+
coord_sf(xlim = c(-125, -105), ylim = c(20, 50)) +
geom_point(data = cali.locs, aes(x = decimallongitude, y = decimallatitude, col=subspecies), alpha =.9, size=3, show.legend=TRUE,
position = position_jitter(width = 0.3, height = 0.3)) +
theme_classic()+
guides(colour = guide_legend(override.aes = list(size = 4), order=1, label.theme = element_text(face = "italic")),
size = guide_legend(nrow = 1, order = 2))+
theme(legend.position = c(0.01, 0.01), legend.justification = c(0.01, 0.01),legend.key=element_blank(),legend.background=element_blank())

#ggplot color by subspecies
ggplot(cali.pca.scores, aes(x=PC1, y=PC2, color=cali.locs$subspecies)) +
geom_point(cex = 2)

#test IBD
cali.gen@pop<-cali.locs$subspecies
inds<-stamppNeisD(cali.gen, pop = FALSE)
inds<-as.dist(inds)
#
cali.coords<-data.frame(Long=cali.locs$decimallongitude, Lat=cali.locs$decimallatitude)
#indica make dist matrix convert to km distance
cali.Dgeo <- as.dist(distm(cali.coords, fun=distGeo))
cali.Dgeo<-cali.Dgeo/1000
#run ibd test
IBD <- mantel.randtest(cali.Dgeo,inds)
IBD
## Monte-Carlo test
## Call: mantel.randtest(m1 = cali.Dgeo, m2 = inds)
##
## Observation: 0.4758088
##
## Based on 999 replicates
## Simulated p-value: 0.001
## Alternative hypothesis: greater
##
## Std.Obs Expectation Variance
## 4.513362570 -0.004899689 0.011343917
plot(cali.Dgeo,inds, pch=20,cex=1.5, ylab = "Nei's Genetic Distance", xlab = "Geographic Distance (km)")
abline(lm(inds~cali.Dgeo), lty = 2)

#subset genlight to only woodhouse scrub jays
woodhouse.gen<-new("genlight",(as.matrix(gen)[locs$species == "woodhouseii" | locs$species == "sumichrasti", ]))
#subset loc info
woodhouse.locs<-locs[locs$species == "woodhouseii" | locs$species == "sumichrasti", ]
#PCA of woodhouse
woodhouse.pca<-glPca(woodhouse.gen, nf=6)
#pull pca scores out of df
woodhouse.pca.scores<-as.data.frame(woodhouse.pca$scores)
#plot map of woodhousefornia colored by subspecies
ggplot()+
geom_polygon(data = pac, aes(x=long, y = lat, group = group), fill="grey", col="black", cex=.1)+
coord_sf(xlim = c(-125, -105), ylim = c(20, 50)) +
geom_point(data = woodhouse.locs, aes(x = decimallongitude, y = decimallatitude, col=subspecies), alpha =.9, size=3, show.legend=TRUE, position = position_jitter(width = 0.3, height = 0.3)) +
theme_classic()+
guides(colour = guide_legend(override.aes = list(size = 4), order=1, label.theme = element_text(face = "italic")),
size = guide_legend(nrow = 1, order = 2))+
theme(legend.position = c(0.01, 0.01), legend.justification = c(0.01, 0.01),legend.key=element_blank(),legend.background=element_blank())

#ggplot color by subspecies
ggplot(woodhouse.pca.scores, aes(x=PC1, y=PC2, color=woodhouse.locs$subspecies)) +
geom_point(cex = 2)

#test IBD
wood.gen@pop<-wood.locs$subspecies
inds<-stamppNeisD(wood.gen, pop = FALSE)
inds<-as.dist(inds)
#
wood.coords<-data.frame(Long=wood.locs$decimallongitude, Lat=wood.locs$decimallatitude)
#indica make dist matrix convert to km distance
wood.Dgeo <- as.dist(distm(wood.coords, fun=distGeo))
wood.Dgeo<-wood.Dgeo/1000
#run ibd test
IBD <- mantel.randtest(wood.Dgeo,inds)
IBD
## Monte-Carlo test
## Call: mantel.randtest(m1 = wood.Dgeo, m2 = inds)
##
## Observation: 0.6117445
##
## Based on 999 replicates
## Simulated p-value: 0.001
## Alternative hypothesis: greater
##
## Std.Obs Expectation Variance
## 2.166090e+01 1.323052e-04 7.972590e-04
plot(wood.Dgeo,inds, pch=20,cex=1.5, ylab = "Nei's Genetic Distance", xlab = "Geographic Distance (km)")
abline(lm(inds~wood.Dgeo), lty = 2)

#calc heterozygosity and make df for plotting
gen.mat<-as.matrix(gen)
loci<-rowSums(is.na(gen.mat) == FALSE)
het<-rowSums(gen.mat == 1, na.rm = TRUE)/loci
het.df<-data.frame(id=locs$id,subspecies=locs$subspecies,het=het)
#plot heterozygosity as violin plots for each subspecies
ggplot(het.df, aes(x=subspecies, y=het)) +
geom_violin(trim=FALSE)+
geom_dotplot(binaxis='y', stackdir='center', dotsize = .5)+
theme_classic()+
theme(axis.text.x = element_text(face = "italic", angle = 45, hjust = 1))
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
